This notebook will begin looking at clustering methods on the expression of the genes in a single sample of the dataset of interest, from an unbiased approach.

Set Up

# Load libraries
library(magrittr)
library(scater)
library(readr)
library(bluster)
library(ggpubr)
library(pheatmap)

# Set file paths
data_dir <- file.path("results", "Gawad_processed_data")

# Source custom functions script
source(file.path("utils", "clustering-functions.R"))

Read in data

sample_290_normalized <- read_rds(
  file.path(data_dir, "SCPCS000216", "SCPCL000290_miQC_processed_sce.rds"))

Perform clustering

k-means

# Perform k-means clustering
sample_290_normalized <- kmeans_clustering(
  sample_290_normalized,
  params_range = c(4:10),
  step_size = 1
)

# grab column names with clustering results
kmeans_cols <- grep("kmeans", colnames(colData(sample_290_normalized)))
kmeans_cluster_names <- colnames(colData(sample_290_normalized)[, kmeans_cols])

# Plot k-means
kmeans_plot_list <- kmeans_cluster_names %>%
  purrr::map(~ plotReducedDim(sample_290_normalized, dimred = "UMAP", colour_by = .x) + 
               theme_bw()+
               theme(text = element_text(size = 22)))

cowplot::plot_grid(plotlist = kmeans_plot_list, ncol = 4)

graph-based, walktrap

# Perform graph-based walktrap clustering
sample_290_normalized <- graph_clustering(
  sample_290_normalized,
  params_range = c(5:25),
  step_size = 5,
  weighting_type = "rank",
  cluster_function = "walktrap"
)

# grab column names with clustering results
walktrap_cols <- grep("walktrap", colnames(colData(sample_290_normalized)))
walktrap_cluster_names <- colnames(colData(sample_290_normalized)[, walktrap_cols])

# Plot
walktrap_plot_list <- walktrap_cluster_names %>%
   purrr::map(~ plotReducedDim(sample_290_normalized, dimred = "UMAP", colour_by = .x) + 
                theme_bw() +
                theme(text = element_text(size = 22)))

cowplot::plot_grid(plotlist = walktrap_plot_list, ncol = 3)

graph-based, louvain

# Perform graph-based louvain clustering
sample_290_normalized <- graph_clustering(
  sample_290_normalized,
  params_range = c(5:25),
  step_size = 5,
  weighting_type = "jaccard",
  cluster_function = "louvain"
)

# grab column names with clustering results
louvain_cols <- grep("walktrap", colnames(colData(sample_290_normalized)))
louvain_cluster_names <- colnames(colData(sample_290_normalized)[, louvain_cols])

# Plot
louvain_plot_list <- louvain_cluster_names %>%
   purrr::map(~ plotReducedDim(sample_290_normalized, dimred = "UMAP", colour_by = .x) + 
                theme_bw() +
                theme(text = element_text(size = 22)))


cowplot::plot_grid(plotlist = louvain_plot_list, ncol = 3)

Check cluster validity stats

k-means

# Check the k-means cluster validity stats for each of the clusters and return
# stats in a data frame
kmeans_stats_df <- create_metadata_stats_df(sample_290_normalized, c(4:10), 1, "kmeans")
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
# Preview the results
head(kmeans_stats_df)

# Summarize the stats and return in a data frame
kmeans_summary_stats_df <- summarize_clustering_stats(kmeans_stats_df)
`summarise()` has grouped output by 'cluster_names_column', 'cluster_type'. You can override using the `.groups` argument.
# Preview the summary results
head(kmeans_summary_stats_df)

purity plots

# Plot individual cluster purity stats
kmeans_purity_plots <- plot_cluster_purity(kmeans_stats_df)

kmeans_purity_plots

silhouette width plots

# Plot individual cluster silhouette width stats
kmeans_silhouette_plots <- plot_cluster_silhouette_width(kmeans_stats_df)

kmeans_silhouette_plots

graph-based, walktrap

# Check the walktrap cluster validity stats for each of the clusters and return
# stats in a data frame
walktrap_stats_df <- create_metadata_stats_df(sample_290_normalized, c(5:25), 5, "walktrap")
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
  
# Preview the all stats results
head(walktrap_stats_df)

# Summarize the stats and return in a data frame
walktrap_summary_stats_df <- summarize_clustering_stats(walktrap_stats_df)
`summarise()` has grouped output by 'cluster_names_column', 'cluster_type'. You can override using the `.groups` argument.
# Preview the summary results
head(walktrap_summary_stats_df)

purity plots

# Plot individual cluster purity stats
walktrap_purity_plots <- plot_cluster_purity(walktrap_stats_df)

walktrap_purity_plots

silhouette width plots

# Plot individual cluster silhouette width stats
walktrap_silhouette_plots <- plot_cluster_silhouette_width(walktrap_stats_df)

walktrap_silhouette_plots

graph-based, louvain

# Check the louvain cluster validity stats for each of the clusters and return
# stats in a data frame
louvain_stats_df <- create_metadata_stats_df(sample_290_normalized, c(5:25), 5, "louvain")
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
Joining, by = "cell_barcode"
# Preview the results
head(louvain_stats_df)

# Summarize the stats and return in a data frame
louvain_summary_stats_df <- summarize_clustering_stats(louvain_stats_df)
`summarise()` has grouped output by 'cluster_names_column', 'cluster_type'. You can override using the `.groups` argument.
# Preview the summary results
head(louvain_summary_stats_df)

purity plots

# Plot individual cluster purity stats
louvain_purity_plots <- plot_cluster_purity(louvain_stats_df)

louvain_purity_plots

silhouette width plots

# Plot individual cluster silhouette width stats
louvain_silhouette_plots <- plot_cluster_silhouette_width(louvain_stats_df)

louvain_silhouette_plots

Summary plots

summary_stats_df_list <- list("walktrap" = walktrap_summary_stats_df,
                              "louvain" = louvain_summary_stats_df)

# purity summary plot
plot_avg_validity_stats(summary_stats_df_list, "avg_purity")


#silhouette width summary plot
plot_avg_validity_stats(summary_stats_df_list, "avg_width")

Check cluster stability

k-means

# Check cluster stability
kmeans_ari_df <- get_cluster_stability_summary(sample_290_normalized, c(4:10), 1, "kmeans")
kmeans_ari_df
# plot cluster stability ARI values
plot_cluster_stability_ari(kmeans_ari_df)
Warning: Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

graph-based, walktrap

walktrap_ari_df <- get_cluster_stability_summary(sample_290_normalized, c(5:25), 5, cluster_type = "graph", cluster_function = "walktrap")
walktrap_ari_df
# plot cluster stability ARI values
plot_cluster_stability_ari(walktrap_ari_df)
Warning: Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

graph-based, louvain

louvain_ari_df <- get_cluster_stability_summary(sample_290_normalized, c(5:25), 5, cluster_type = "graph", cluster_function = "louvain")
louvain_ari_df
# plot cluster stability ARI values
plot_cluster_stability_ari(louvain_ari_df)
Warning: Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

Cluster stability summary plots

summary_ari_df_list <- list("walktrap" = walktrap_ari_df,
                            "louvain" = louvain_ari_df)

ari_summary_df <- summarize_cluster_stability_ari(summary_ari_df_list)
`summarise()` has grouped output by 'cluster_names_column', 'cluster_type'. You can override using the `.groups` argument.
ari_summary_df

Session info

sessionInfo()
R version 4.1.2 (2021-11-01)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 18.04.3 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/openblas/libblas.so.3
LAPACK: /usr/lib/x86_64-linux-gnu/libopenblasp-r0.2.20.so

locale:
 [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8        LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8   
 [6] LC_MESSAGES=C.UTF-8    LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C           LC_TELEPHONE=C        
[11] LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] pheatmap_1.0.12             ggpubr_0.4.0                bluster_1.4.0               readr_2.1.1                
 [5] scater_1.22.0               ggplot2_3.3.5               scuttle_1.4.0               SingleCellExperiment_1.16.0
 [9] SummarizedExperiment_1.24.0 Biobase_2.54.0              GenomicRanges_1.46.1        GenomeInfoDb_1.30.0        
[13] IRanges_2.28.0              S4Vectors_0.32.3            BiocGenerics_0.40.0         MatrixGenerics_1.6.0       
[17] matrixStats_0.61.0          magrittr_2.0.1             

loaded via a namespace (and not attached):
 [1] bitops_1.0-7              RColorBrewer_1.1-2        tools_4.1.2               backports_1.4.0          
 [5] utf8_1.2.2                R6_2.5.1                  irlba_2.3.3               vipor_0.4.5              
 [9] DBI_1.1.1                 colorspace_2.0-2          nnet_7.3-17               withr_2.4.3              
[13] tidyselect_1.1.1          gridExtra_2.3             compiler_4.1.2            cli_3.1.0                
[17] BiocNeighbors_1.12.0      DelayedArray_0.20.0       labeling_0.4.2            scales_1.1.1             
[21] digest_0.6.29             rmarkdown_2.11            XVector_0.34.0            pkgconfig_2.0.3          
[25] htmltools_0.5.2           sparseMatrixStats_1.6.0   fastmap_1.1.0             rlang_0.4.12             
[29] rstudioapi_0.13           DelayedMatrixStats_1.16.0 farver_2.1.0              generics_0.1.1           
[33] BiocParallel_1.28.2       dplyr_1.0.7               car_3.0-12                RCurl_1.98-1.5           
[37] BiocSingular_1.10.0       modeltools_0.2-23         GenomeInfoDbData_1.2.7    Matrix_1.4-1             
[41] Rcpp_1.0.7                ggbeeswarm_0.6.0          munsell_0.5.0             fansi_0.5.0              
[45] abind_1.4-5               viridis_0.6.2             lifecycle_1.0.1           yaml_2.2.1               
[49] carData_3.0-4             MASS_7.3-57               zlibbioc_1.40.0           flexmix_2.3-17           
[53] grid_4.1.2                parallel_4.1.2            ggrepel_0.9.1             crayon_1.4.2             
[57] lattice_0.20-45           cowplot_1.1.1             beachmat_2.10.0           splines_4.1.2            
[61] hms_1.1.1                 knitr_1.36                pillar_1.6.4              igraph_1.2.9             
[65] ggsignif_0.6.3            ScaledMatrix_1.2.0        magic_1.6-0               pdfCluster_1.0-3         
[69] glue_1.5.1                evaluate_0.14             renv_0.14.0               tweenr_1.0.2             
[73] vctrs_0.3.8               tzdb_0.2.0                polyclip_1.10-0           gtable_0.3.0             
[77] purrr_0.3.4               tidyr_1.1.4               assertthat_0.2.1          ggforce_0.3.3            
[81] xfun_0.28                 rsvd_1.0.5                broom_0.7.10              miQC_1.2.0               
[85] rstatix_0.7.0             viridisLite_0.4.0         geometry_0.4.6            tibble_3.1.6             
[89] beeswarm_0.4.0            cluster_2.1.3             ellipsis_0.3.2           
---
title: "Clustering"
author: "Data Lab for ALSF"
date: "2022"
output:
  html_notebook:
    toc: yes
    toc_float: yes
  html_document:
    toc: yes
    df_print: paged
---

This notebook will begin looking at clustering methods on the expression of the genes in a single sample of the dataset of interest, from an unbiased approach.

## Set Up 

```{r}
# Load libraries
library(magrittr)
library(scater)
library(readr)
library(bluster)
library(ggpubr)
library(pheatmap)

# Set file paths
data_dir <- file.path("results", "Gawad_processed_data")

# Source custom functions script
source(file.path("utils", "clustering-functions.R"))
```

## Read in data

```{r}
sample_290_normalized <- read_rds(
  file.path(data_dir, "SCPCS000216", "SCPCL000290_miQC_processed_sce.rds"))
```

## Perform clustering

### k-means

```{r fig.height = 10.5}
# Perform k-means clustering
sample_290_normalized <- kmeans_clustering(
  sample_290_normalized,
  params_range = c(4:10),
  step_size = 1
)

# grab column names with clustering results
kmeans_cols <- grep("kmeans", colnames(colData(sample_290_normalized)))
kmeans_cluster_names <- colnames(colData(sample_290_normalized)[, kmeans_cols])

# Plot k-means
kmeans_plot_list <- kmeans_cluster_names %>%
  purrr::map(~ plotReducedDim(sample_290_normalized, dimred = "UMAP", colour_by = .x) + 
               theme_bw()+
               theme(text = element_text(size = 22)))

cowplot::plot_grid(plotlist = kmeans_plot_list, ncol = 4)
```

### graph-based, walktrap

```{r fig.height = 9.5}
# Perform graph-based walktrap clustering
sample_290_normalized <- graph_clustering(
  sample_290_normalized,
  params_range = c(5:25),
  step_size = 5,
  weighting_type = "rank",
  cluster_function = "walktrap"
)

# grab column names with clustering results
walktrap_cols <- grep("walktrap", colnames(colData(sample_290_normalized)))
walktrap_cluster_names <- colnames(colData(sample_290_normalized)[, walktrap_cols])

# Plot
walktrap_plot_list <- walktrap_cluster_names %>%
   purrr::map(~ plotReducedDim(sample_290_normalized, dimred = "UMAP", colour_by = .x) + 
                theme_bw() +
                theme(text = element_text(size = 22)))

cowplot::plot_grid(plotlist = walktrap_plot_list, ncol = 3)
```

### graph-based, louvain

```{r fig.height = 8.5}
# Perform graph-based louvain clustering
sample_290_normalized <- graph_clustering(
  sample_290_normalized,
  params_range = c(5:25),
  step_size = 5,
  weighting_type = "jaccard",
  cluster_function = "louvain"
)

# grab column names with clustering results
louvain_cols <- grep("walktrap", colnames(colData(sample_290_normalized)))
louvain_cluster_names <- colnames(colData(sample_290_normalized)[, louvain_cols])

# Plot
louvain_plot_list <- louvain_cluster_names %>%
   purrr::map(~ plotReducedDim(sample_290_normalized, dimred = "UMAP", colour_by = .x) + 
                theme_bw() +
                theme(text = element_text(size = 22)))


cowplot::plot_grid(plotlist = louvain_plot_list, ncol = 3)
```

## Check cluster validity stats

### k-means

```{r fig.height = 20.5}
# Check the k-means cluster validity stats for each of the clusters and return
# stats in a data frame
kmeans_stats_df <- create_metadata_stats_df(sample_290_normalized, c(4:10), 1, "kmeans")

# Preview the results
head(kmeans_stats_df)

# Summarize the stats and return in a data frame
kmeans_summary_stats_df <- summarize_clustering_stats(kmeans_stats_df)

# Preview the summary results
head(kmeans_summary_stats_df)
```

#### purity plots

```{r fig.height=10, fig.width=15}
# Plot individual cluster purity stats
kmeans_purity_plots <- plot_cluster_purity(kmeans_stats_df)

kmeans_purity_plots
```

#### silhouette width plots

```{r fig.height=10, fig.width=15}
# Plot individual cluster silhouette width stats
kmeans_silhouette_plots <- plot_cluster_silhouette_width(kmeans_stats_df)

kmeans_silhouette_plots
```

### graph-based, walktrap

```{r fig.height = 15.5}
# Check the walktrap cluster validity stats for each of the clusters and return
# stats in a data frame
walktrap_stats_df <- create_metadata_stats_df(sample_290_normalized, c(5:25), 5, "walktrap")
  
# Preview the all stats results
head(walktrap_stats_df)

# Summarize the stats and return in a data frame
walktrap_summary_stats_df <- summarize_clustering_stats(walktrap_stats_df)

# Preview the summary results
head(walktrap_summary_stats_df)
```

#### purity plots

```{r fig.height=10, fig.width=15}
# Plot individual cluster purity stats
walktrap_purity_plots <- plot_cluster_purity(walktrap_stats_df)

walktrap_purity_plots
```

#### silhouette width plots

```{r fig.height=10, fig.width=15}
# Plot individual cluster silhouette width stats
walktrap_silhouette_plots <- plot_cluster_silhouette_width(walktrap_stats_df)

walktrap_silhouette_plots
```

### graph-based, louvain

```{r fig.height = 9.5}
# Check the louvain cluster validity stats for each of the clusters and return
# stats in a data frame
louvain_stats_df <- create_metadata_stats_df(sample_290_normalized, c(5:25), 5, "louvain")

# Preview the results
head(louvain_stats_df)

# Summarize the stats and return in a data frame
louvain_summary_stats_df <- summarize_clustering_stats(louvain_stats_df)

# Preview the summary results
head(louvain_summary_stats_df)
```

#### purity plots

```{r fig.height=10}
# Plot individual cluster purity stats
louvain_purity_plots <- plot_cluster_purity(louvain_stats_df)

louvain_purity_plots
```

#### silhouette width plots

```{r fig.height=10}
# Plot individual cluster silhouette width stats
louvain_silhouette_plots <- plot_cluster_silhouette_width(louvain_stats_df)

louvain_silhouette_plots
```

### Summary plots

```{r}
summary_stats_df_list <- list("walktrap" = walktrap_summary_stats_df,
                              "louvain" = louvain_summary_stats_df)

# purity summary plot
plot_avg_validity_stats(summary_stats_df_list, "avg_purity")

#silhouette width summary plot
plot_avg_validity_stats(summary_stats_df_list, "avg_width")
```

## Check cluster stability

### k-means

```{r}
# Check cluster stability
kmeans_ari_df <- get_cluster_stability_summary(sample_290_normalized, c(4:10), 1, "kmeans")
kmeans_ari_df
```

```{r}
# plot cluster stability ARI values
plot_cluster_stability_ari(kmeans_ari_df)
```

### graph-based, walktrap

```{r warning=FALSE}
walktrap_ari_df <- get_cluster_stability_summary(sample_290_normalized, c(5:25), 5, cluster_type = "graph", cluster_function = "walktrap")
walktrap_ari_df
```

```{r}
# plot cluster stability ARI values
plot_cluster_stability_ari(walktrap_ari_df)
```

### graph-based, louvain

```{r warning=FALSE}
louvain_ari_df <- get_cluster_stability_summary(sample_290_normalized, c(5:25), 5, cluster_type = "graph", cluster_function = "louvain")
louvain_ari_df
```

```{r}
# plot cluster stability ARI values
plot_cluster_stability_ari(louvain_ari_df)
```

### Cluster stability summary plots

```{r}
summary_ari_df_list <- list("walktrap" = walktrap_ari_df,
                            "louvain" = louvain_ari_df)

ari_summary_df <- summarize_cluster_stability_ari(summary_ari_df_list)
ari_summary_df
```

## Session info

```{r}
sessionInfo()
```
